1 Loading data and packages

library(readxl)
library(readr)
library(CTexploreR)
library(Vennerable)
library(biomaRt)
library(tidyverse)
library(SummarizedExperiment)
library(UpSetR)
library(ComplexHeatmap)
library(circlize)
library(SingleCellExperiment)
library(org.Hs.eg.db)
library(clusterProfiler)
library(msigdbr)
library(DOSE)
library(BiocParallel)
library(patchwork)
library(Biostrings)

Gene names/synonyms required for databases cleaning

ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
attributes_vector <- c("ensembl_gene_id", "external_gene_name",
                       "external_synonym", "gene_biotype",
                       "chromosome_name", "band", "start_position", "end_position",
                       "strand")
ensembl_gene_synonym <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))

ensembl_gene_synonym <- ensembl_gene_synonym %>%
  mutate(external_synonym = sub(pattern = "ORF", external_synonym, 
                                replacement = "orf"))

attributes_vector <- c("ensembl_gene_id", "external_gene_name")
ensembl_gene_names <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))

attributes_vector <- c("external_gene_name",
                       "external_synonym")
gene_synonym <- as_tibble(getBM(attributes = attributes_vector, mart = ensembl))
# CCLE_data <- CTdata::CCLE_data()
# TCGA_TPM <- CTdata::TCGA_TPM()
# testis_sce <- CTdata::testis_sce()
####

load("../CTdata/eh_data/CCLE_data.rda")
load("../CTdata/eh_data/TCGA_TPM.rda")
load("../CTdata/eh_data/testis_sce.rda")
load("../CTdata/eh_data/mean_methylation_in_tissues.rda")

load("../CTdata/eh_data/all_genes.rda")
load("../CTdata/eh_data/HPA_cell_type_specificities.rda")
load("../CTdata/eh_data/CT_genes.rda")

CT_and_CTP_genes <- CT_genes
CT_genes <- (filter(CT_genes, CT_gene_type == "CT_gene"))

Common figures parameters

legends_param <- list(
  labels_gp = gpar(col = "black", fontsize = 6),
  title_gp = gpar(col = "black", fontsize = 6),
  row_names_gp = gpar(fontsize = 4),
  annotation_name_side = "left")

legend_colors <- c("#5E4FA2", "#3288BD", "#66C2A5", "#ABDDA4", "#E6F598",
                   "#FFFFBF", "#FEE08B", "#FDAE61", "#F46D43", "#D53E4F",
                   "#9E0142")
chr_colors <- c("X-linked" = "deeppink", "Not X" = "royalblue1")

meth_colors <- c("Methylation" = "lightgreen", "Not methylation" = "gray")

2 Pipeline figure values

Here is the code used to have all the values in the pipeline figure

table(all_genes$GTEX_category)
## 
##     lowly_expressed               other testis_preferential     testis_specific 
##                2658               19741                 535                1568
table(all_genes$multimapping_analysis)
## 
## not_testis_specific testis_preferential     testis_specific 
##                2575                   7                  76
table(all_genes$testis_specificity)
## 
## not_testis_specific testis_preferential     testis_specific 
##               22316                 607                1579
specific_to_pref <- all_genes %>%
  filter(GTEX_category == "testis_specific" |
           multimapping_analysis == "testis_specific") %>% 
  filter(testis_specificity == "testis_preferential")
table(specific_to_pref$not_detected_in_somatic_HPA)
## 
## FALSE 
##    65
table(specific_to_pref$CCLE_category)
## 
##     activated         leaky not_activated 
##            26             1            38
table(specific_to_pref$TCGA_category)
## 
##          activated              leaky multimapping_issue      not_activated 
##                 28                 10                  2                 25
all_genes %>% 
  filter(testis_specificity != "not_testis_specific") %>% 
  filter(CCLE_category == "activated" & 
           (TCGA_category == "multimapping_issue" | 
              TCGA_category == "activated")) %>% 
  pull(testis_specificity) %>% 
  table()
## .
## testis_preferential     testis_specific 
##                  70                 229
table(all_genes$CT_gene_type) #Before and after TSS verif
## 
##  CT_gene CTP_gene    other 
##      176       62    24264

3 Database comparison

3.1 Lists cleaning

CT lists from other databases have been checked (using GTEx and our GTEx_expression() funtion and GeneCards) in order to remove duplicated gene names or deprecated ones and allow comparison between databases.

3.1.1 CTdatabase

Online list copied in a csv file, several lists exist so we combined them.

We checked gene names that were a concatenation of two genes (choice using biomaRt synonyms to get the official one), checked which ones had the right names, removed duplicated genes, verified lost genes and added back those that should be there.

CTdatabase <- read_delim("data/CTdatabase1.csv", delim = ";", 
                         escape_double = FALSE, trim_ws = TRUE)
colnames(CTdatabase) <- c("Family", "Gene_Name", "Chromosomal_localization",
                          "CT_identifier")
CTdatabase_bis <- read_csv2("data/CTdatabase2.csv")
CTdatabase <- left_join(CTdatabase, CTdatabase_bis, 
                        by = c("Gene_Name" = "Gene_Symbol"))


CTdatabase_single <- CTdatabase %>%
  mutate(Gene_Name = sub(pattern = "/.*$", Gene_Name, replacement = ""))
CTdatabase_single <- CTdatabase_single %>%
  mutate(Gene_Name = sub(pattern = ",.*$", Gene_Name, replacement = ""))


CTdatabase_official_names <- 
  unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, 
                       external_gene_name)) %>%
  filter(external_gene_name %in% CTdatabase_single$Gene_Name) %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA)
CTdatabase_synonym <- 
  ensembl_gene_synonym %>%
  filter(external_synonym %in% CTdatabase_single$Gene_Name) %>%
  mutate(Gene_Name = external_synonym) %>%
  dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym)
CTdatabase_cleaned <- 
  rbind(CTdatabase_official_names, CTdatabase_synonym) %>% 
  left_join(CTdatabase_single)


duplicated_genes <- CTdatabase_cleaned$Gene_Name[duplicated(CTdatabase_cleaned$Gene_Name)]
bad_ids <- ensembl_gene_synonym %>%
  filter(external_gene_name %in% duplicated_genes | external_synonym %in% duplicated_genes) %>%
  filter(chromosome_name %in% grep(pattern = "H", x = chromosome_name, value = TRUE)) %>%
  pull(ensembl_gene_id)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
  dplyr::filter(!ensembl_gene_id %in% bad_ids)
CTdatabase_cleaned <- CTdatabase_cleaned %>%
  filter(!ensembl_gene_id == "ENSG00000052126")
CTdatabase_cleaned <- CTdatabase_cleaned %>% 
  filter(!(ensembl_gene_id == "ENSG00000183305" & Gene_Name == "MAGEA2"))
CTdatabase_cleaned <- CTdatabase_cleaned %>% 
  filter(!ensembl_gene_id == "ENSG00000204648")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CSAG3B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CSAG2", "external_synonym"] <- "CSAG3B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT45A4")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CT45A3", "external_synonym"] <- "CT45A4"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "LAGE-1b")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAG2", "external_synonym"] <- "LAGE-1b"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CT16.2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "PAGE5", "external_synonym"] <- "CT16.2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXB2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXB1", "external_synonym"] <- "SPANXB2"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "SPANXE")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "SPANXD", "external_synonym"] <- "SPANXE"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1C")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1D")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE1E")
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "XAGE2B")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "XAGE2", "external_synonym"] <- "XAGE2B"
CTdatabase_cleaned <- filter(CTdatabase_cleaned, Gene_Name != "CTAGE-2")
CTdatabase_cleaned[CTdatabase_cleaned$Gene_Name == "CTAGE1", "external_synonym"] <- "CTAGE-2"


CTdatabase_cleaned <- ensembl_gene_synonym %>%
  mutate(Gene_Name = external_synonym) %>%
  filter(external_synonym == "CXorf61") %>%
  dplyr::select(ensembl_gene_id, external_gene_name, Gene_Name, external_synonym) %>%
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "Cxorf61", 
                    c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name)) %>%
  filter(external_gene_name == "CCNA1") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "cyclin A1", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "GOLGA6L2") %>%
  filter(ensembl_gene_id == "ENSG00000174450") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "GOLGAGL2 FA", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "LYPD6B") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC130576", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "CT62") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC196993", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "CT75") %>%
  filter(ensembl_gene_id == "ENSG00000291155") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC440934", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "LINC01192") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC647107", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "TSPY1") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "LOC728137", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)
CTdatabase_cleaned <- unique(dplyr::select(ensembl_gene_synonym, ensembl_gene_id, external_gene_name))%>%
  filter(external_gene_name == "SSX2B") %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA) %>% 
  cbind(CTdatabase_single[CTdatabase_single$Gene_Name == "SSX2b", 
                          c("Family", "Chromosomal_localization", "CT_identifier", "Classification")]) %>% 
  rbind(CTdatabase_cleaned)

3.1.2 Jamin’s list

Excel file coming from supplemental data.

Jamin_core_CT <- read_excel("data/Jamin_core_CT.xlsx")
Jamin_core_CT[Jamin_core_CT$Gene == "KIAA1211", "Gene"] <- "CRACD"
Jamin_core_CT[Jamin_core_CT$Gene == "CXorf67", "Gene"] <- "EZHIP"

3.1.3 Wang’s CTatlas

Excel file coming from supplemental data.

Wang_CT <- read_excel("data/Wang_Suppl_Data_3.xlsx", 
    sheet = "Supplementary Data 3B", skip = 1)
colnames(Wang_CT)[1] <- "ensembl_gene_id"

Wang_CT <- ensembl_gene_names %>% 
  filter(ensembl_gene_id %in% Wang_CT$ensembl_gene_id) %>%
  right_join(Wang_CT)

Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000181013", "external_gene_name"] <- "C17orf47"
Wang_CT[Wang_CT$ensembl_gene_id == "ENSG00000204293", "external_gene_name"] <- "OR8B2"
Wang_CT[Wang_CT$external_gene_name == "", "external_gene_name"] <- "RNASE11"
Wang_CT[Wang_CT$external_gene_name == "CHCT1", "external_gene_name"] <- "C17orf64"
Wang_CT[Wang_CT$external_gene_name == "PRSS40A", "external_gene_name"] <- "TISP43"
Wang_CT[Wang_CT$external_gene_name == "TEX56P", "external_gene_name"] <- "C6orf201"
Wang_CT[Wang_CT$external_gene_name == "SLC25A51P4", "external_gene_name"] <- "RP11-113D6.10"
Wang_CT[Wang_CT$external_gene_name == "TCP10L3", "external_gene_name"] <- "TCP10"
Wang_CT[Wang_CT$external_gene_name == "SCAND3", "external_gene_name"] <- "ZBED9"

3.1.4 Carter’s list

Carter_CT_list <- read_table("data/Carter_CT_list.txt", skip = 1)
Carter_CT <- Carter_CT_list[Carter_CT_list$CT_Expression,]

Carter_CT[Carter_CT$Gene == "ENSG00000261649", "Gene_Name"] <- "GOLGA6L7"
Carter_CT[Carter_CT$Gene == "ENSG00000239620", "Gene_Name"] <- "PRR20G"
Carter_CT[Carter_CT$Gene == "ENSG00000168148", "Gene_Name"] <- "H3-4"
Carter_CT[Carter_CT$Gene == "ENSG00000204296", "Gene_Name"] <- "TSBP1"
Carter_CT[Carter_CT$Gene == "ENSG00000180219", "Gene_Name"] <- "GARIN6"
Carter_CT[Carter_CT$Gene == "ENSG00000172717", "Gene_Name"] <- "GARIN2"
Carter_CT[Carter_CT$Gene == "ENSG00000174015", "Gene_Name"] <- "CBY2"
Carter_CT[Carter_CT$Gene == "ENSG00000224960", "Gene_Name"] <- "PPP4R3C"

3.1.5 Burggeman’s list

Excel file from supplemental data.

Bruggeman_data <- read_excel("data/Bruggeman_suppl_data.xlsx", skip = 1,
                           sheet = "1D")

Bruggeman_official_names <- gene_synonym %>% 
  dplyr::select(external_gene_name) %>% 
  unique() %>% 
  filter(external_gene_name %in% Bruggeman_data$Gene) %>%
  mutate(Gene_Name = external_gene_name) %>%
  mutate(external_synonym = NA)

Bruggeman_synonym <- gene_synonym %>%
  filter(external_synonym %in% Bruggeman_data$Gene) %>%
  mutate(Gene_Name = external_synonym) %>%
  dplyr::select(external_gene_name, Gene_Name, external_synonym)

Bruggeman_synonym <- Bruggeman_synonym[-which(Bruggeman_synonym$Gene_Name %in% 
                          Bruggeman_official_names$Gene_Name),]

Bruggeman_CT <- rbind(Bruggeman_official_names, Bruggeman_synonym)

lost <- Bruggeman_data[which(!Bruggeman_data$Gene %in% c(Bruggeman_CT$Gene_Name)), "Gene"]
colnames(lost) <- "external_gene_name"
lost$Gene_Name <- rep(NA, nrow(lost))
lost$external_synonym <- rep(NA, nrow(lost))

lost[lost$external_gene_name == "C21orf59", "Gene_Name"] <- "CFAP298"
lost[lost$external_gene_name == "C11orf57", "Gene_Name"] <- "NKAPD1"
lost[lost$external_gene_name == "C7orf55", "Gene_Name"] <- "FMC1"
lost[lost$external_gene_name == "C10orf12", "Gene_Name"] <- "LCOR"
lost[lost$external_gene_name == "RPL19P12", "Gene_Name"] <- "RPL19P12"
lost[lost$external_gene_name == "C16orf59", "Gene_Name"] <- "TEDC2"
lost[lost$external_gene_name == "TTTY15", "Gene_Name"] <- "USP9Y"
lost[lost$external_gene_name == "C17orf53", "Gene_Name"] <- "HROB"
lost[lost$external_gene_name == "C1orf112", "Gene_Name"] <- "FIRRM"
lost[lost$external_gene_name == "C12orf66", "Gene_Name"] <- "KICS2"
lost[lost$external_gene_name == "C9orf84", "Gene_Name"] <- "SHOC1"
lost[lost$external_gene_name == "C10orf25", "Gene_Name"] <- "ZNF22-AS1"
lost[lost$external_gene_name == "C20orf197", "Gene_Name"] <- "LINC02910"
lost[lost$external_gene_name == "C3orf67", "Gene_Name"] <- "CFAP20DC"
lost[lost$external_gene_name == "C8orf37", "Gene_Name"] <- "CFAP418"
lost[lost$external_gene_name == "C22orf34", "Gene_Name"] <- "MIR3667HG"
  
Bruggeman_CT <- rbind(Bruggeman_CT, lost) 

missing_Bruggeman <- c("BMS1P4", "ADAM6", "ANXA2P3", "ARHGAP11B", "DPY19L2P2", 
                       "HLA-L", "PA2G4P4", "PIPSL", "PRKY", "YBX3P1", 
                       "RPL23AP53", "UQCRBP1", "RPL23P8", "MRS2P2", "PIN4P1", 
                       "SLC6A10P", "GUSBP2", "PPIEL", "LRRC37BP1", "MSL3P1", 
                       "PLEKHA8P1", "STAG3L1", "TCAM1P", "ZNF702P", "ZNF815P", 
                       "ATP6AP1L", "RPL21P44", "SEC14L1P1", "ZNF876P", 
                       "RPLP0P2", "FAM86JP", "FAM175A", "LACE1", "ATP5EP2", 
                       "WDR92", "TCTE3", "METTL20", "KIAA2022", "ZNRD1-AS1", 
                       "SGOL1", "FAM35DP", "MTL5", "TMEM14E", "MLLT4-AS1", 
                       "CCDC173", "KIAA1524", "WDR78", "LINC00476", "LYRM5", 
                       "HILS1", "CASC5", "KIAA1919", "CTAGE5", "FAM188B", 
                       "TMEM194B", "FAM122C", "PPP1R2P3", "KIAA0391", "SGOL2", 
                       "FAM19A3", "ZNF788", "RPL19P12", "FIRRM")

external_names_to_keep <- gene_synonym %>%
   filter(external_synonym %in% missing_Bruggeman) %>%
   filter(!external_gene_name %in% c("ATP5F1EP2", "POLR1HASP", "SHLD2P3", 
                                   "TMEM14EP", "H1-9P", "ZNF788P")) %>% 
   mutate(Gene_Name = external_gene_name)
 
Bruggeman_CT[Bruggeman_CT$external_synonym %in% 
                    external_names_to_keep$external_synonym, 
                  "Gene_Name"] <- external_names_to_keep$Gene_Name

Bruggeman_CT <- Bruggeman_CT %>% 
  dplyr::select(Gene_Name)

3.2 CTexploreR data for selection pipeline

To characterise the differences between our database and other, we need the category we created in CTexploreR. For this, we have the object all_genes in CTdata that contains the CT analysis for all genes. More info in

Hereunder is what we used for our selection pipeline (coming from make_all_genes_prelim.R and 130_make_all_genes_and_CT_genes.R in CTdata).

all_genes

From there, we filtered based on the testis_specificity (“testis_specific”, which is based on expression in health tissue and scRNA seq info from HPA), CCLE_category (“activated”) and TCGA_category (“activated” or “multimapping_issue”) to have our CT genes. Then, when wanting to validate TSS manually, we realised that for some genes, reads were not properly aligned to exons which might reflect a poorly defined transcription in these regions and are hence likely unreliable.

Some genes were also characterized as Cancer-Testis preferential genes when testis specificity was less stringent

3.3 CTexploreR VS CTdatabase

CTdatabase_ours <- Venn(list(CTdatabase = CTdatabase_cleaned$external_gene_name,
                             CTexploreR = CT_genes$external_gene_name))
gp <- VennThemes(compute.Venn(CTdatabase_ours))
gp[["Face"]][["11"]]$fill <-  "mistyrose"
gp[["Face"]][["01"]]$fill <-  "darkseagreen1"
gp[["Face"]][["10"]]$fill <-  "lightsteelblue1"
gp[["Set"]][["Set1"]]$col <-  "paleturquoise4"
gp[["Set"]][["Set2"]]$col <-  "darkseagreen4"
gp[["SetText"]][["Set1"]]$col <-  "paleturquoise4"
gp[["SetText"]][["Set2"]]$col <-  "darkseagreen4"
plot(CTdatabase_ours, gp = gp)

We find 29.0322581 % of CTdatabase in CTexploreR, which is 40.9090909% of our database.

Lost genes analysis

CTdatabase_lost <- all_genes %>%
  filter(external_gene_name %in% CTdatabase_ours@IntersectionSets[["10"]])

# 9 Genes are lost because not in any database

table(CTdatabase_lost$testis_specificity)
## 
## not_testis_specific testis_preferential     testis_specific 
##                  70                  54                  41
table(CTdatabase_lost$CT_gene_type)
## 
## CTP_gene    other 
##       15      150
table(CTdatabase_lost$not_detected_in_somatic_HPA)
## 
## FALSE  TRUE 
##    51   106
table(CTdatabase_lost$TCGA_category)
## 
##          activated              leaky multimapping_issue      not_activated 
##                 34                 83                 25                 23
table(CTdatabase_lost$CCLE_category)
## 
##     activated         leaky not_activated 
##            81            33            51
table(CTdatabase_lost$TCGA_category, CTdatabase_lost$CCLE_category)
##                     
##                      activated leaky not_activated
##   activated                 20     0            14
##   leaky                     41    33             9
##   multimapping_issue        15     0            10
##   not_activated              5     0            18
CTdatabase_lost_upset <- 
  list(`Not testis specific` = 
         filter(CTdatabase_lost,
                testis_specificity != "testis_specific")$external_gene_name,
       `Not tumour activated` = 
         filter(CTdatabase_lost,
                (TCGA_category != "activated" &
                  TCGA_category != "multimapping_issue")|
                   CCLE_category != "activated")$external_gene_name,
       `CT preferential` =
         filter(CTdatabase_lost,
                CT_gene_type == "CTP_gene")$external_gene_name)

upset_CTdatabase <- fromList(CTdatabase_lost_upset)
upset(upset_CTdatabase)

 (nrow(CTdatabase_lost) - table(CTdatabase_lost$testis_specificity)[["testis_specific"]])/nrow(CTdatabase_lost)*100
## [1] 75.15152

75.1515152 % of these genes are not testis specific.

However 15 of these lost genes are flagged as Cancer-Testis preferential in our analysis.

87.8787879 % are not properly activated in tumors and/or cancer cell lines.

In their analysis, they had characterised gene specificity, some being not available, not found in testis, testis-restricted, testis-selective and testis/brain-restricted. Let’s see how the lost genes qualify as they didn’t mention those were strictly testis specific.

CTdatabase_cleaned %>% 
  filter(external_gene_name %in% CTdatabase_lost$external_gene_name) %>% 
  pull(Classification) %>% 
  table()
## .
##           not available     not found in testis       testis-restricted 
##                      90                       2                      15 
##        testis-selective testis/brain-restricted 
##                      51                       7

We can see that most of them had no info or were testis-selective (I couldn’t find on website or paper how they selected categories).

3.4 CTexploreR VS omics databases

core_ours <- Venn(list(Jamin = Jamin_core_CT$Gene, 
                       CTexploreR = CT_genes$external_gene_name))

Wang_ours <- Venn(list(Wang = Wang_CT$external_gene_name, 
                       CTexploreR = CT_genes$external_gene_name))

Carter_ours <- Venn(list(Carter_CT = Carter_CT$Gene_Name, 
                         CTexploreR = CT_genes$external_gene_name))

Bruggeman_ours <- Venn(list(Bruggeman = Bruggeman_CT$Gene_Name,
                            CTexploreR = CT_genes$external_gene_name))

gene_list <- list(CTexploreR = CT_genes$external_gene_name,
                  Carter = Carter_CT$Gene_Name,
                  Jamin = Jamin_core_CT$Gene, 
                  CTatlas = Wang_CT$external_gene_name,
                  Bruggeman = Bruggeman_CT$Gene_Name)

upset_omics <- fromList(gene_list)
upset(upset_omics)

4 in all, 60 in at least 3 databases

Lost genes analysis

plot(core_ours, gp = gp)

Jamin_lost <- all_genes %>%
  filter(external_gene_name %in% core_ours@IntersectionSets[["10"]])

table(Jamin_lost$testis_specificity)
## 
## not_testis_specific testis_preferential     testis_specific 
##                  70                  20                   2
table(Jamin_lost$CT_gene_type)
## 
## CTP_gene    other 
##       10       82
table(Jamin_lost$not_detected_in_somatic_HPA)
## 
## FALSE  TRUE 
##    44    48
table(Jamin_lost$TCGA_category)
## 
##          activated              leaky multimapping_issue      not_activated 
##                  8                 63                 17                  4
table(Jamin_lost$CCLE_category)
## 
##     activated         leaky not_activated 
##            49            39             4
table(Jamin_lost$TCGA_category, Jamin_lost$CCLE_category)
##                     
##                      activated leaky not_activated
##   activated                  8     0             0
##   leaky                     22    39             2
##   multimapping_issue        17     0             0
##   not_activated              2     0             2
Jamin_lost_upset <- 
  list(`Not testis specific` = 
         filter(Jamin_lost,
                testis_specificity != "testis_specific")$external_gene_name,
       `Not tumour activated` = 
         filter(Jamin_lost,
                (TCGA_category != "activated" &
                  TCGA_category != "multimapping_issue")|
                   CCLE_category != "activated")$external_gene_name,
       `CT preferential` =
         filter(Jamin_lost,
                CT_gene_type == "CTP_gene")$external_gene_name)

upset_Jamin <- fromList(Jamin_lost_upset)
upset(upset_Jamin)

We find 23.2 % of CTdatabase in CTexploreR, which is 16.4772727 % of our database.

97.826087% of these genes are not testis specific.

However 10 of these lost genes are flagged as Cancer-Testis preferential in our analysis.

91.3043478 % are not properly activated in tumors and/or cancer cell lines.

plot(Wang_ours, gp = gp)

Wang_lost <- all_genes %>%
  filter(external_gene_name %in% Wang_ours@IntersectionSets[["10"]])


table(Wang_lost$testis_specificity)
## 
## not_testis_specific testis_preferential     testis_specific 
##                 336                 348                 229
table(Wang_lost$CT_gene_type)
## 
## CTP_gene    other 
##       40      873
table(Wang_lost$not_detected_in_somatic_HPA)
## 
## FALSE  TRUE 
##   254   628
table(Wang_lost$TCGA_category)
## 
##          activated              leaky multimapping_issue      not_activated 
##                191                474                 66                182
table(Wang_lost$CCLE_category)
## 
##     activated         leaky not_activated 
##           342           206           365
table(Wang_lost$TCGA_category, Wang_lost$CCLE_category)
##                     
##                      activated leaky not_activated
##   activated                 88     3           100
##   leaky                    188   201            85
##   multimapping_issue        34     0            32
##   not_activated             32     2           148
Wang_lost_upset <- 
  list(`Not testis specific` = 
         filter(Wang_lost,
                testis_specificity != "testis_specific")$external_gene_name,
       `Not tumour activated` = 
         filter(Wang_lost,
                (TCGA_category != "activated" &
                  TCGA_category != "multimapping_issue")|
                   CCLE_category != "activated")$external_gene_name,
       `CT preferential` =
         filter(Wang_lost,
                CT_gene_type == "CTP_gene")$external_gene_name)

upset_Wang <- fromList(Wang_lost_upset)
upset(upset_Wang)

We find 9.0284593 % of CTdatabase in CTexploreR, which is 52.2727273 % of our database.

74.9178532% of these genes are not testis specific.

However 40 of these lost genes are flagged as Cancer-Testis preferential in our analysis.

90.3614458 % are not properly activated in tumors and/or cancer cell lines.

plot(Carter_ours, gp = gp)

Carter_lost <- all_genes %>%
  filter(external_gene_name %in% Carter_ours@IntersectionSets[["10"]])

table(Carter_lost$testis_specificity)
## 
## not_testis_specific testis_preferential     testis_specific 
##                   1                  22                  34
table(Carter_lost$CT_gene_type)
## 
## CTP_gene    other 
##        6       51
table(Carter_lost$not_detected_in_somatic_HPA)
## 
## FALSE  TRUE 
##    13    44
table(Carter_lost$TCGA_category)
## 
##     activated         leaky not_activated 
##            28            14            15
table(Carter_lost$CCLE_category)
## 
##     activated         leaky not_activated 
##            19             3            35
table(Carter_lost$TCGA_category, Carter_lost$CCLE_category)
##                
##                 activated leaky not_activated
##   activated             9     0            19
##   leaky                 6     3             5
##   not_activated         4     0            11
Carter_lost_upset <- 
  list(`Not testis specific` = 
         filter(Carter_lost,
                testis_specificity != "testis_specific")$external_gene_name,
       `Not tumour activated` = 
         filter(Carter_lost,
                (TCGA_category != "activated" &
                  TCGA_category != "multimapping_issue")|
                   CCLE_category != "activated")$external_gene_name,
       `CT preferential` =
         filter(Carter_lost,
                CT_gene_type == "CTP_gene")$external_gene_name)

upset_Carter <- fromList(Carter_lost_upset)
upset(upset_Carter)

We find 38.8349515 % of CTdatabase in CTexploreR, which is 22.7272727 % of our database.

40.3508772% of these genes are not testis specific.

However 6 of these lost genes are flagged as Cancer-Testis preferential in our analysis.

84.2105263 % are not properly activated in tumors and/or cancer cell lines.

plot(Bruggeman_ours, gp = gp)

Bruggeman_lost <- all_genes %>%
  filter(external_gene_name %in% Bruggeman_ours@IntersectionSets[["10"]])

table(Bruggeman_lost$testis_specificity)
## 
## not_testis_specific testis_preferential     testis_specific 
##                 627                  61                  11
table(Bruggeman_lost$CT_gene_type)
## 
## CTP_gene    other 
##        6      693
table(Bruggeman_lost$not_detected_in_somatic_HPA)
## 
## FALSE  TRUE 
##   441   234
table(Bruggeman_lost$TCGA_category)
## 
##          activated              leaky multimapping_issue      not_activated 
##                 46                624                 11                 18
table(Bruggeman_lost$CCLE_category)
## 
##     activated         leaky not_activated 
##           242           428            29
table(Bruggeman_lost$TCGA_category, Bruggeman_lost$CCLE_category)
##                     
##                      activated leaky not_activated
##   activated                 32     3            11
##   leaky                    193   418            13
##   multimapping_issue         9     0             2
##   not_activated              8     7             3
Bruggeman_lost_upset <- 
  list(`Not testis specific` = 
         filter(Bruggeman_lost,
                testis_specificity != "testis_specific")$external_gene_name,
       `Not tumour activated` = 
         filter(Bruggeman_lost,
                (TCGA_category != "activated" &
                  TCGA_category != "multimapping_issue")|
                   CCLE_category != "activated")$external_gene_name,
       `CT preferential` =
         filter(Bruggeman_lost,
                CT_gene_type == "CTP_gene")$external_gene_name)

upset_Bruggeman <- fromList(Bruggeman_lost_upset)
upset(upset_Bruggeman)

We find 1.7195767 % of CTdatabase in CTexploreR, which is 7.3863636 % of our database.

98.4263233% of these genes are not testis specific.

However 6 of these lost genes are flagged as Cancer-Testis preferential in our analysis.

95.4220315 % are not properly activated in tumors and/or cancer cell lines.

3.5 Characterisation of differences with all databases

common <- unique(c(core_ours@IntersectionSets[["11"]], 
                   CTdatabase_ours@IntersectionSets[["11"]], 
                   Wang_ours@IntersectionSets[["11"]], 
                   Carter_ours@IntersectionSets[["11"]],
                   Bruggeman_ours@IntersectionSets[["11"]]))

length(common)
## [1] 119
length(common)/dim(CT_genes)[1] * 100
## [1] 67.61364
lost_list <- unique(c(core_ours@IntersectionSets[["10"]],
                      CTdatabase_ours@IntersectionSets[["10"]],
                      Wang_ours@IntersectionSets[["10"]],
                      Carter_ours@IntersectionSets[["10"]],
                      Bruggeman_ours@IntersectionSets[["10"]]))

lost <- all_genes %>%
  filter(external_gene_name %in% lost_list)


not_specific <- filter(lost, testis_specificity == "not_testis_specific")

GTEX_expression(not_specific$external_gene_name, units = "log_TPM")
## Error in readRDS(file) : error reading from connection
## Warning: 5 out of 989 names invalid: FIRRM, CFAP96, SPMIP5, LIAT1, BLTP2.
## See the manual page for valid types.

somatic_testis <- filter(lost, not_detected_in_somatic_HPA == FALSE)

testis_expression(somatic_testis$external_gene_name, cells = "all")
## Warning: 11 out of 710 names invalid: MPL, EDAR, ASB14, F2RL2, PCDHGB2, FOXO3B,
## KRT33B, SEMG1, ADORA2A, DAZ2, DAZ4.
## See the manual page for valid types.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

HPA_cell_type_expression(somatic_testis$external_gene_name)

not_TCGA_activated <- filter(lost, TCGA_category != "activated" & 
                               TCGA_category != "multimapping_issue")

TCGA_expression(not_TCGA_activated$external_gene_name,
                tumor = "all",
                units = "log_TPM")
## Warning: 16 out of 1283 names invalid: FIRRM, CIMIP2C, CFAP96, SPMIP10, SPMIP4,
## SPMIP7, SPATA31F1, SPMIP5, CIMAP1A, SAXO4, CIMAP1C, LIAT1, BLTP2,
## SPMAP2, CIMAP1D, SAXO5.
## See the manual page for valid types.
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

not_CCLE_activated <- filter(lost, CCLE_category  != "activated")

CCLE_expression(not_CCLE_activated$external_gene_name,
                  type = c("lung", "skin", "colorectal",
                           "gastric", "breast", "head_and_neck"),
                units = "log_TPM")
## Warning: 11 out of 1034 names invalid: FIRRM, SPMIP10, SPMIP4, SPATA31F1,
## CIMIP2A, CIMAP1A, SAXO4, CIMAP1C, LIAT1, BLTP2, CIMAP1D.
## See the manual page for valid types.

transcript_prob <- lost %>% 
  filter(testis_specificity == "testis_specific" |
           testis_specificity == "testis_preferential") %>%
  filter(TCGA_category == "activated" | TCGA_category == "multimapping_issue") %>% 
  filter(CCLE_category == "activated") %>% 
  dim()

119 genes in our CTexploreR database are found in at least one of the other database, which represents 67.6136364%.

We have lost 1693 genes in total. Among them, 58.4170112% are not considered testis specific, 41.9373892% are expressed in somatic cells, 75.7826344% are not activated in TCGA samples, 61.0750148% are not activated in CCLE cell lines and 3.4258712% is lost due to transcripts problems.

What about new genes in CTexploreR

new <- CT_genes %>% 
  filter(!external_gene_name%in%common)

new
table(new$testis_specificity)
## 
## testis_specific 
##              57
table(new$X_linked)
## 
## FALSE  TRUE 
##    46    11
table(new$regulated_by_methylation)
## 
## FALSE  TRUE 
##    23    34
table(new$X_linked, new$regulated_by_methylation)
##        
##         FALSE TRUE
##   FALSE    23   23
##   TRUE      0   11
TCGA_expression(tumor = "all", genes = new$external_gene_name, 
                units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

TCGA_expression(tumor = "all", 
                genes = filter(new, X_linked & regulated_by_methylation)$external_gene_name, 
                units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

There are 57 new CT genes in CTexploreR. These are all testis specific and mainly on autosomes. Regulation by methylation is the majority of them. There is only 11 new “major” CT that are on the X chromosome and regulated by methylation. CT45 are not that new.

Expression in tumours doesn’t strike that much.

4 CT genes characteristics

table(CT_genes$testis_specificity)
## 
## testis_specific 
##             176
table(CT_genes$transcript_biotype)
## 
##         lncRNA protein_coding 
##             34            142

Most genes are testis specific (100%). Most genes are mainly protein coding genes (80.6818182%).

4.1 X chromosome and regulated by methylation

In CTexploreR, genes have been characterised as regulated by methylation or not.

table(CT_genes$X_linked)
## 
## FALSE  TRUE 
##    98    78
table(CT_genes$regulated_by_methylation)
## 
## FALSE  TRUE 
##    47   129
table(CT_genes$X_linked, CT_genes$testis_specificity)
##        
##         testis_specific
##   FALSE              98
##   TRUE               78
table(CT_genes$X_linked, CT_genes$regulated_by_methylation)
##        
##         FALSE TRUE
##   FALSE    43   55
##   TRUE      4   74

Genes are enriched on the X chromosome (44.3181818%). Also, 129 genes have been flagged as regulated by methylation (73.2954545%). It is interesting to study the link between these two characteristics.

On the chromosome X, there is a clear enrichment of CT genes regulated by methylation (74/78 chrX genes or 74/129 genes regulated by methylation).

Let’s check that with a statistical test, I here want to see if, on there is an enrichment of genes regulated by methylation on the X chromosome. I need to do a Pearson Chi square test (to know if observed proportion differ from expected proportion). It is a statistical method to determine if two categorical variables have a significant correlation between them. I can directly put a matrix (my table) in the function

chisq.test(table(CT_genes$X_linked, CT_genes$regulated_by_methylation))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(CT_genes$X_linked, CT_genes$regulated_by_methylation)
## X-squared = 31.367, df = 1, p-value = 2.135e-08
CT_genes$chr_factor <- factor(CT_genes$chr,
                       levels = c("1", "2", "3", "4", "5", "6", "7", "8", "9",
                                  "10", "11", "12", "13", "14", "15", "16", "17", 
                                  "18", "19", "20", "21", "22", "X", "Y"))

CT_genes %>% 
  mutate(regulated_by_methylation = ifelse(regulated_by_methylation,
                                           "Regulated by methylation",
                                           "Not regulated by methylation")) %>% 
  ggplot(aes(x = chr_factor, fill = X_linked)) +
  geom_bar(stat = 'count') +
  scale_fill_manual(values = c("royalblue1", "deeppink")) +
  facet_wrap(~ regulated_by_methylation, ncol = 2) +
  theme_bw() +
  xlab("Chromosome") +
  ylab("Number of genes") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none",
        axis.title = element_text(size = 10, color = "gray10"))

There is indeed a significative link between regulation by methylation and being on the X chromosome. There is thus an enrichment of CT genes regulated by methylation on the X chromosome (and inversely).

4.2 Tumour and methylation

Using CTexploreR functions, we can explore all CT genes or focus on some potential targets.

4.2.1 All CT

For these heatmaps, the code comes from the function but has been copies to add some annotations.

chr_mat <- as.matrix(CT_genes$X_linked)
chr_mat <- ifelse(chr_mat == TRUE, "X-linked", "Not X")
rownames(chr_mat) <- CT_genes$external_gene_name
row_ha_chr <- rowAnnotation(chr_factor = chr_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(chr_factor = chr_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")



regulation_mat <- as.matrix(CT_genes$regulated_by_methylation)
regulation_mat <- ifelse(regulation_mat == TRUE, "Methylation",
                         "Not methylation")
rownames(regulation_mat) <- CT_genes$external_gene_name


row_ha_reg <- rowAnnotation(regulation = regulation_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(regulation = meth_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")

left_annot <- c(row_ha_chr, row_ha_reg, gap = unit(1, "mm"))

split <- data.frame(CT_genes$regulated_by_methylation, CT_genes$X_linked)
database <- TCGA_TPM
database$tumor <- sub(pattern = 'TCGA-', x = database$project_id, '')
database$type <- "Tumor"
database$type[database$shortLetterCode == "NT"] <- "Peritumoral"
database <- database[, order(database$tumor, database$type)]

genes <- CT_genes$external_gene_name
database <- database[rowData(database)$external_gene_name %in% genes, ]
database <- database[match(genes, rowData(database)$external_gene_name), ]
database <- database[, database$type == "Tumor"]

column_ha_tumor <- HeatmapAnnotation(
  Tumor = database$tumor,
  border = TRUE,
  col = list(Tumor = c("BRCA" = "midnightblue", "COAD" = "darkorchid2",
                 "ESCA" = "gold", "HNSC" = "deeppink2",
                 "LUAD" = "seagreen", "LUSC" = "seagreen3",
                 "SKCM" = "red3")),
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param)

split_by <- factor(database$tumor)
col_annot <- column_ha_tumor

mat <- log1p(assay(database))
rownames(mat) <- rowData(database)$external_gene_name


Heatmap(mat[, , drop = FALSE],
        name = "logTPM",
        column_title = paste0("Expression in TCGA samples (all)"),
        column_split = split_by,
        row_split = split,
        row_title_gp = gpar(fontsize = 0),
        col = colorRamp2(seq(0, max(mat), length = 11),
                         legend_colors),
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        cluster_rows = TRUE,
        show_column_names = FALSE,
        cluster_columns = TRUE,
        show_column_dend = FALSE,
        show_row_dend = FALSE,
        row_names_gp = gpar(fontsize = 4),
        heatmap_legend_param = legends_param,
        top_annotation = col_annot,
        left_annotation = left_annot)

database <- CCLE_data
database$type <- tolower(database$type)
genes <- CT_genes$external_gene_name
database <- database[rowData(database)$external_gene_name %in% genes, ]
database <- database[match(genes, rowData(database)$external_gene_name), ]

mat <- log1p(assay(database))
rownames(mat) <- rowData(database)$external_gene_name

df_col <- data.frame("cell_line" = colData(database)$cell_line_name,
                     "type" = as.factor(colData(database)$type))
rownames(df_col) <- rownames(colData(database))
df_col <- df_col[order(df_col$type), ]

column_ha_type <- HeatmapAnnotation(
  type = df_col$type,
  border = TRUE,
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param,
  col = list(type = c("lung" = "seagreen3", "skin" = "red3",
                 "bile_duct" = "mediumpurple1", "bladder" = "mistyrose2",
                 "colorectal" = "plum", "lymphoma" = "steelblue1",
                 "uterine" = "darkorange4", "myeloma" = "turquoise3", 
                 "kidney" = "thistle4",
                 "pancreatic" = "darkmagenta", "brain" = "palegreen2",
                 "gastric" = "wheat3", "breast" = "midnightblue",
                 "bone" = "sienna1", "head_and_neck" = "deeppink2",
                 "ovarian" = "tan3", "sarcoma" = "lightcoral",
                 "leukemia" = "steelblue4", "esophageal"= "khaki",
                 "neuroblastoma" = "olivedrab1")))



Heatmap(mat[, rownames(df_col), drop = FALSE],
        name = "logTPM",
        column_title = "Gene Expression in tumor cell lines (CCLE)",
        column_split = factor(df_col$type),
        row_split = split,
        row_title_gp = gpar(fontsize = 0),
        col = colorRamp2(seq(0, max(mat), length = 11),
                          legend_colors),
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        cluster_rows = TRUE,
        show_row_dend = FALSE,
        show_column_names = FALSE,
        cluster_columns = TRUE,
        show_column_dend = FALSE,
        row_names_gp = gpar(fontsize = 4),
        heatmap_legend_param = legends_param,
        top_annotation = c(column_ha_type),
        left_annotation = left_annot)

Next graph was removed from paper

genes <- CT_genes$external_gene_name
database <- mean_methylation_in_tissues[rownames(mean_methylation_in_tissues) %in% genes]

mat <- na.omit(assay(database))
clustering_option <- TRUE

row_ha_chr_meth <- rowAnnotation(chr = chr_mat[rownames(mat),],
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(chr = chr_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")


row_ha_reg_meth <- rowAnnotation(regulation = regulation_mat[rownames(mat),],
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(regulation = meth_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")

left_annot_meth <- c(row_ha_chr_meth, row_ha_reg_meth, gap = unit(1, "mm"))

split_meth <- data.frame(filter(CT_genes, external_gene_name %in% rownames(mat))$regulated_by_methylation, 
                    filter(CT_genes, external_gene_name %in% rownames(mat))$X_linked)

Heatmap(mat,
        column_title = 'Promoter mean methylation level by tissue',
        name = 'Meth',
        col = colorRamp2(c(1:100),
                         colorRampPalette(c("moccasin","dodgerblue4"))(100)),
        na_col = "gray80",
        cluster_rows = clustering_option,
        cluster_columns = FALSE,
        row_split = split_meth,
        row_title_gp = gpar(fontsize = 0),
        show_row_names = TRUE,
        show_heatmap_legend = TRUE,
        show_row_dend = FALSE,
        row_names_gp = gpar(fontsize = 3),
        column_names_gp = gpar(fontsize = 8),
        column_names_side = "bottom",
        row_names_side = "right",
        left_annotation = left_annot_meth)

4.2.2 MAGE genes

MAGE_genes <- filter(CT_genes, family == "MAGE")$external_gene_name

TCGA_expression(tumor = "all", 
                genes = MAGE_genes,
                units = "log_TPM")
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
## 
## Set `ht_opt$message = FALSE` to turn off this message.

CCLE_expression(genes = MAGE_genes,
                 type = c("lung", "skin", "bile_duct", "bladder", 
                          "colorectal", "lymphoma", "uterine",
                          "myeloma", "kidney", "pancreatic", "brain", 
                          "gastric", "breast", "bone", "head_and_neck", 
                          "ovarian", "sarcoma", "leukemia", "esophageal",
                          "neuroblastoma"), units = "log_TPM")

normal_tissues_mean_methylation(MAGE_genes, na.omit = TRUE)

4.3 Single Cell analysis : timing of expression

In CTexploreR, we there is single cell data from the testis, we thus can analyse CT genes expression during spermatogenesis.

There is only data for 137 of our 176 CT genes.

NB : SSC, Spermatogonia and early spermatocytes are premeiotic cells. Late spermatocytes (between both meiosis), round spermatid, elongated spermatid and sperm are postmeiotic cells.

genes_avail <- 
  CT_genes$external_gene_name[CT_genes$external_gene_name %in% unique(rownames(testis_sce))]
table(CT_genes$testis_cell_type)
## Warning: Unknown or uninitialised column: `testis_cell_type`.
## < table of extent 0 >
table(CT_genes$testis_cell_type)/length(genes_avail)*100
## Warning: Unknown or uninitialised column: `testis_cell_type`.
## numeric(0)

NA % of genes are mainly expressed pre-meioticly.

germ_cells <- c("SSC", "Spermatogonia", "Early_spermatocyte",
                "Late_spermatocyte","Round_spermatid", "Elongated_spermatid",
                "Sperm1", "Sperm2")
somatic_cells <- c("Macrophage", "Endothelial", "Myoid", "Sertoli", "Leydig")

testis_sce_CT <- testis_sce[genes_avail, ]
  
mat <- SingleCellExperiment::logcounts(testis_sce_CT)

df_col <- data.frame(clusters = colData(testis_sce_CT)$clusters,
                     type = colData(testis_sce_CT)$type,
                     Donor = colData(testis_sce_CT)$Donor)
rownames(df_col) <- colnames(testis_sce_CT)

df_col <- df_col[order(df_col$type),]
df_col$lineage <- "Germ cells"
df_col$lineage[df_col$type %in% somatic_cells] <- "Somatic cells"
    
column_ha_type = HeatmapAnnotation(
  type = df_col$type,
  border = TRUE,
  col = list(type = c("SSC" = "floralwhite", "Spermatogonia" = "moccasin",
                   "Early_spermatocyte" = "gold", 
                   "Late_spermatocyte" = "orange",
                   "Round_spermatid" = "red2", 
                   "Elongated_spermatid" = "darkred",
                   "Sperm1" = "violet", "Sperm2" = "purple", 
                   "Sertoli" = "gray", 
                   "Leydig" = "cadetblue2", "Myoid" = "springgreen3", 
                   "Macrophage" = "gray10",
                   "Endothelial" = "steelblue")),
  
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param)
    
column_ha_lineage = HeatmapAnnotation(
  lineage = df_col$lineage,
  border = TRUE,
  col = list(lineage = c("Germ cells" = "salmon", "Somatic cells" = "cyan4")),
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = legends_param)

scale_lims <- c(0, quantile(rowMax(mat), 0.75))
top_annot <- c(column_ha_lineage, column_ha_type)

# Until here is what's in the function, hereunder is my addition/change in Heatmap()

CT_genes_avail <- filter(CT_genes, external_gene_name %in% genes_avail)

chr_mat <- as.matrix(CT_genes_avail$X_linked)
chr_mat <- ifelse(chr_mat == TRUE, "X-linked", "Not X")
rownames(chr_mat) <- CT_genes_avail$external_gene_name
row_ha_chr <- rowAnnotation(chr = chr_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(chr = chr_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")


regulation_mat <- as.matrix(CT_genes_avail$regulated_by_methylation)
regulation_mat <- ifelse(regulation_mat == TRUE, "Methylation",
                         "Not methylation")
rownames(regulation_mat) <- CT_genes_avail$external_gene_name


row_ha_reg <- rowAnnotation(regulation = regulation_mat,
                            annotation_legend_param = legends_param,
                            simple_anno_size = unit(0.5, "cm"),
                            col = list(regulation = meth_colors),
                            annotation_name_gp = gpar(fontsize = 8),
                            annotation_name_side = "top")

left_annot <- c(row_ha_chr, row_ha_reg, gap = unit(1, "mm"))

split <- data.frame(CT_genes_avail$regulated_by_methylation, CT_genes_avail$X_linked)
    
Heatmap(mat[genes_avail, rownames(df_col), drop = FALSE],
        name = "logCounts",
        column_title = "Expression in testis cells (scRNAseq)",
        column_split = df_col$type,
        row_split = split,
        row_title_gp = gpar(fontsize = 0),
        show_column_names = FALSE,
        show_column_dend = FALSE,
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        cluster_rows = TRUE,
        cluster_columns = FALSE,
        show_row_dend = FALSE,
        row_names_gp = gpar(fontsize = 4),
        col = colorRamp2(seq(scale_lims[1], scale_lims[2], length = 11),                 
                         legend_colors),
        top_annotation = top_annot,
        left_annotation = left_annot,
        heatmap_legend_param = legends_param)

4.4 Gene function

All CT genes function

msigdbr(species = "Homo sapiens" , category = "C5") %>% 
  filter(gene_symbol %in% CT_genes$external_gene_name) %>% 
  pull(gene_symbol) %>% 
  unique() %>% 
  length()
## [1] 106
msigdbr(species = "Homo sapiens" , category = "H") %>% 
  filter(gene_symbol %in% CT_genes$external_gene_name) %>% 
  pull(gene_symbol) %>% 
  unique() %>% 
  length()
## [1] 7
go_ora <- enrichGO(gene = CT_genes$ensembl_gene_id,
                   keyType = "ENSEMBL",
                   OrgDb = org.Hs.eg.db,
                   ont = "all",
                   readable = TRUE)
as_tibble(go_ora)
as_tibble(go_ora) %>% 
  arrange(desc(Count)) %>% 
  head(12) %>% 
  mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/177,
                           ONTOLOGY == "CC"~ Count/197,
                           ONTOLOGY == "MF"~ Count/186)) %>% 
  ggplot(aes(x = Ratio, y = Description, fill = Description)) +
  geom_col() +
  theme_bw() +
  ylab("GO term") +
  xlab("Gene Ratio") +
  theme(axis.text.y = element_blank(),
        legend.position = "none",
        axis.ticks.y = element_blank(),
        axis.title = element_text(size = 10, color = "gray10"))+ 
  geom_text(aes(0, y = Description, label = Description),
            hjust = 0,
            nudge_x = 0.005,
            colour = "floralwhite",
            size = 4)

ora_to_plot <- as_tibble(simplify(go_ora)) 

ora_to_plot <- ora_to_plot %>% 
  arrange(desc(Count)) %>% 
  head(9) %>% 
  mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/102,
                           ONTOLOGY == "MF"~ Count/186))
  
       
ora_to_plot %>% 
  ggplot(aes(x = Ratio, y = Description, fill = Description)) +
  geom_col() +
  theme_bw() +
  ylab("GO term") +
  xlab("Gene Ratio") +
  theme(axis.text.y = element_blank(),
        legend.position = "none",
        axis.ticks.y = element_blank(),
        axis.title = element_text(size = 10, color = "gray10"))+ 
  geom_text(aes(0, y = Description, label = Description),
            hjust = 0,
            nudge_x = 0.005,
            colour = "floralwhite",
            size = 3.7)

As we can see here, most of genes are indeed linked to functions from reproduction. I represented here the 12 categories with the most genes, all from biological processes. However, they are enriched in 54 different GO terms.

Is there a difference between meth reg or not ?

go_ora_meth <- enrichGO(gene = 
                          filter(CT_genes, regulated_by_methylation)$ensembl_gene_id,
                        keyType = "ENSEMBL",
                        OrgDb = org.Hs.eg.db,
                        ont = "all",
                        readable = TRUE)
go_ora_meth <- simplify(go_ora_meth)
as_tibble(go_ora_meth)
go_ora_not_meth <- enrichGO(gene = 
                          filter(CT_genes, !regulated_by_methylation)$ensembl_gene_id,
                        keyType = "ENSEMBL",
                        OrgDb = org.Hs.eg.db,
                        ont = "all",
                        readable = TRUE)
go_ora_not_meth <- simplify(go_ora_not_meth)
as_tibble(go_ora_not_meth)
go_ora_meth_plot <- as_tibble(go_ora_meth) %>% 
  arrange(desc(Count)) %>% 
  head(10) %>% 
  mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/73,
                           ONTOLOGY == "MF"~ Count/80,
                           ONTOLOGY == "CC"~ Count/78)) %>%  
  mutate(regulation = "Methylation")

go_ora_not_meth_plot <- as_tibble(go_ora_not_meth) %>% 
  arrange(desc(Count)) %>% 
  head(8) %>% 
  mutate(Ratio = case_when(ONTOLOGY == "BP"~ Count/29,
                           ONTOLOGY == "MF"~ Count/30))%>%  
  mutate(regulation = "Not methylation")


rbind(go_ora_meth_plot, go_ora_not_meth_plot) %>% 
  ggplot(aes(x = Ratio, y = Description, fill = Description)) +
  geom_col() +
  theme_bw() +
  ylab("GO term") +
  xlab("Gene Ratio") +
  facet_wrap(~ regulation) + 
  theme(legend.position = "none",
        axis.ticks.y = element_blank(),
        axis.text.y = element_text(size = 6, color = "gray10"),
        axis.title.x = element_text(size = 10, color = "gray10"),
        axis.title.y = element_blank())+ 
  geom_text(aes(0, y = Description, label = ID),
            hjust = 0,
            nudge_x = 0.005,
            colour = "floralwhite",
            size = 2)

We’ve also added a column tumor suppressor and oncogene in CTexploreR. These information come from Cancermine.

table(CT_genes$oncogene) 
## 
## oncogene 
##       23
table(CT_genes$tumor_suppressor)
## 
## tumor_suppressor 
##                6
table(CT_genes$oncogene, CT_genes$tumor_suppressor) 
##           
##            tumor_suppressor
##   oncogene                4

5 SessionInfo

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] Biostrings_2.70.3           XVector_0.42.0             
##  [3] patchwork_1.2.0             BiocParallel_1.36.0        
##  [5] DOSE_3.28.2                 msigdbr_7.5.1              
##  [7] clusterProfiler_4.10.1      org.Hs.eg.db_3.18.0        
##  [9] AnnotationDbi_1.64.1        SingleCellExperiment_1.24.0
## [11] circlize_0.4.16             ComplexHeatmap_2.18.0      
## [13] UpSetR_1.4.0                SummarizedExperiment_1.32.0
## [15] Biobase_2.62.0              GenomicRanges_1.54.1       
## [17] GenomeInfoDb_1.38.7         IRanges_2.36.0             
## [19] S4Vectors_0.40.2            MatrixGenerics_1.14.0      
## [21] matrixStats_1.2.0           lubridate_1.9.3            
## [23] forcats_1.0.0               stringr_1.5.1              
## [25] dplyr_1.1.4                 purrr_1.0.2                
## [27] tidyr_1.3.1                 tibble_3.2.1               
## [29] ggplot2_3.5.0               tidyverse_2.0.0            
## [31] biomaRt_2.58.2              Vennerable_3.0             
## [33] xtable_1.8-4                gtools_3.9.5               
## [35] reshape_0.8.9               RColorBrewer_1.1-3         
## [37] lattice_0.22-5              RBGL_1.78.0                
## [39] graph_1.80.0                BiocGenerics_0.48.1        
## [41] CTexploreR_0.99.5           CTdata_1.2.0               
## [43] readr_2.1.5                 readxl_1.4.3               
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.2                 later_1.3.2                  
##   [3] ggplotify_0.1.2               bitops_1.0-7                 
##   [5] filelock_1.0.3                cellranger_1.1.0             
##   [7] polyclip_1.10-6               XML_3.99-0.16.1              
##   [9] lifecycle_1.0.4               doParallel_1.0.17            
##  [11] vroom_1.6.5                   MASS_7.3-60.0.1              
##  [13] magrittr_2.0.3                sass_0.4.9                   
##  [15] rmarkdown_2.26                jquerylib_0.1.4              
##  [17] yaml_2.3.8                    httpuv_1.6.14                
##  [19] cowplot_1.1.3                 DBI_1.2.2                    
##  [21] abind_1.4-5                   zlibbioc_1.48.2              
##  [23] ggraph_2.2.1                  RCurl_1.98-1.14              
##  [25] yulab.utils_0.1.4             tweenr_2.0.3                 
##  [27] rappdirs_0.3.3                GenomeInfoDbData_1.2.11      
##  [29] enrichplot_1.22.0             ggrepel_0.9.5                
##  [31] tidytree_0.4.6                codetools_0.2-19             
##  [33] DelayedArray_0.28.0           xml2_1.3.6                   
##  [35] ggforce_0.4.2                 tidyselect_1.2.1             
##  [37] shape_1.4.6.1                 aplot_0.2.2                  
##  [39] farver_2.1.1                  viridis_0.6.5                
##  [41] BiocFileCache_2.10.1          jsonlite_1.8.8               
##  [43] GetoptLong_1.0.5              ellipsis_0.3.2               
##  [45] tidygraph_1.3.1               iterators_1.0.14             
##  [47] foreach_1.5.2                 tools_4.3.2                  
##  [49] progress_1.2.3                treeio_1.26.0                
##  [51] Rcpp_1.0.12                   glue_1.7.0                   
##  [53] gridExtra_2.3                 SparseArray_1.2.4            
##  [55] xfun_0.42                     qvalue_2.34.0                
##  [57] withr_3.0.0                   BiocManager_1.30.22          
##  [59] fastmap_1.1.1                 fansi_1.0.6                  
##  [61] digest_0.6.35                 gridGraphics_0.5-1           
##  [63] timechange_0.3.0              R6_2.5.1                     
##  [65] mime_0.12                     colorspace_2.1-0             
##  [67] Cairo_1.6-2                   GO.db_3.18.0                 
##  [69] RSQLite_2.3.5                 utf8_1.2.4                   
##  [71] generics_0.1.3                data.table_1.15.2            
##  [73] prettyunits_1.2.0             graphlayouts_1.1.1           
##  [75] httr_1.4.7                    S4Arrays_1.2.1               
##  [77] scatterpie_0.2.1              pkgconfig_2.0.3              
##  [79] gtable_0.3.4                  blob_1.2.4                   
##  [81] shadowtext_0.1.3              htmltools_0.5.7              
##  [83] fgsea_1.28.0                  clue_0.3-65                  
##  [85] scales_1.3.0                  png_0.1-8                    
##  [87] ggfun_0.1.4                   knitr_1.45                   
##  [89] rstudioapi_0.15.0             tzdb_0.4.0                   
##  [91] reshape2_1.4.4                rjson_0.2.21                 
##  [93] nlme_3.1-164                  curl_5.2.1                   
##  [95] cachem_1.0.8                  GlobalOptions_0.1.2          
##  [97] BiocVersion_3.18.1            parallel_4.3.2               
##  [99] HDO.db_0.99.1                 pillar_1.9.0                 
## [101] vctrs_0.6.5                   promises_1.2.1               
## [103] dbplyr_2.5.0                  cluster_2.1.6                
## [105] evaluate_0.23                 magick_2.8.3                 
## [107] cli_3.6.2                     compiler_4.3.2               
## [109] rlang_1.1.3                   crayon_1.5.2                 
## [111] labeling_0.4.3                plyr_1.8.9                   
## [113] fs_1.6.3                      stringi_1.8.3                
## [115] viridisLite_0.4.2             babelgene_22.9               
## [117] munsell_0.5.0                 lazyeval_0.2.2               
## [119] GOSemSim_2.28.1               Matrix_1.6-5                 
## [121] ExperimentHub_2.10.0          hms_1.1.3                    
## [123] bit64_4.0.5                   KEGGREST_1.42.0              
## [125] shiny_1.8.0                   highr_0.10                   
## [127] interactiveDisplayBase_1.40.0 AnnotationHub_3.10.0         
## [129] igraph_2.0.3                  memoise_2.0.1                
## [131] bslib_0.6.1                   ggtree_3.10.1                
## [133] fastmatch_1.1-4               bit_4.0.5                    
## [135] gson_0.1.0                    ape_5.7-1